In [ ]:
##### @Mike 
# Can you add a few sentences about how you got the data (i.e. the struggles you went through)
In [1]:
# Import necessary modules
import os
import glob

import pandas as pd
import statsmodels.formula.api as smf

import plotly.graph_objects as go # for interactive plot with human defined labels
import plotly.express as px
In [2]:
# Load necessary data
path = "/Users/ZXue/src/speeding-up-sci-correlation/"
df = pd.read_table(os.path.join(path, "TARA_025-merged-only-those-in-both-TPM.tsv"))
df.head() # a look at the sample
Out[2]:
Gene DNA RNA
0 TOBG-MED-1076_1101 7.280325 15.477084
1 TOBG-MED-1076_1131 9.169875 1.856575
2 TOBG-MED-1076_1195 4.482290 13.612574
3 TOBG-MED-1076_1248 2.774258 3.370134
4 TOBG-MED-1076_1298 5.946800 3.612049
In [3]:
## Add KO ID to df
df_anno = pd.read_table(os.path.join(path, "Our-Kegg-annotations.tsv"))
df_anno.head()

df = df.merge(df_anno, on='Gene')
# remove genes that are not annotated with a KO ID
df.dropna()
Out[3]:
Gene DNA RNA KO_ID
0 TOBG-MED-1076_1101 7.280325 15.477084 K01940
1 TOBG-MED-1076_1131 9.169875 1.856575 K01696
4 TOBG-MED-1076_1298 5.946800 3.612049 K00347
8 TOBG-MED-1076_252 37.801689 15.307006 K02115
9 TOBG-MED-1076_253 11.870095 10.093750 K02111
... ... ... ... ...
19917 TOBG-MED-831_959 3.211908 9.754464 K00005
19918 TOBG-MED-831_960 7.000313 36.566736 K03696
19919 TOBG-MED-831_978 1.691907 1.027653 K03455
19920 TOBG-MED-831_98 1.708753 1.037886 K08309
19921 TOBG-MED-831_984 3.945004 3.594256 K05592

10955 rows × 4 columns

In [4]:
## Add pathway info to df
df_path = pd.read_table(os.path.join(path, "Kegg-Orthology-table.tsv"))
df_path.head()

df = df.merge(df_path, on='KO_ID')
df.head()
Out[4]:
Gene DNA RNA KO_ID Category1 Category2 Category3 description
0 TOBG-MED-1076_1101 7.280325 15.477084 K01940 Metabolism Overview 01230 Biosynthesis of amino acids [PATH:ko01230] argG, ASS1; argininosuccinate synthase [EC:6....
1 TOBG-MED-1076_1101 7.280325 15.477084 K01940 Metabolism Amino acid metabolism 00250 Alanine, aspartate and glutamate metabol... argG, ASS1; argininosuccinate synthase [EC:6....
2 TOBG-MED-1076_1101 7.280325 15.477084 K01940 Metabolism Amino acid metabolism 00220 Arginine biosynthesis [PATH:ko00220] argG, ASS1; argininosuccinate synthase [EC:6....
3 TOBG-MED-1076_1101 7.280325 15.477084 K01940 Human Diseases Cardiovascular diseases 05418 Fluid shear stress and atherosclerosis [... argG, ASS1; argininosuccinate synthase [EC:6....
4 TOBG-MED-590_1583 26.498243 30.401414 K01940 Metabolism Overview 01230 Biosynthesis of amino acids [PATH:ko01230] argG, ASS1; argininosuccinate synthase [EC:6....
In [5]:
# Make the figure with scatter dots
tracedot = go.Scatter(x=df['DNA'], # User should change this based on dataframe columns
                      y=df['RNA'], # User defined 
                      mode='markers', # for scatter plot
                      text=df['KO_ID'], # hover text
                      #color=df['Category1'],
                      name='Data') # User defined
In [6]:
# Initialise and fit linear regression model using `statsmodels`
### tutorial https://towardsdatascience.com/introduction-to-linear-regression-in-python-c12a072bedf0
model = smf.ols('RNA ~ DNA', data=df)
model = model.fit()
# get linear regression parameters
model.params
Out[6]:
Intercept    11.028804
DNA           0.495884
dtype: float64
In [7]:
# get linear regression R^2 
model.rsquared
Out[7]:
0.02598684167801979
In [8]:
# get linear regression p-value
model.pvalues
Out[8]:
Intercept     7.292660e-07
DNA          8.397350e-103
dtype: float64
In [9]:
# linear regression equation
# line= 0.495884*xi+11.028804
max_val = df['DNA'].max()
df2 = [[0, 11.028804],[max_val, 0.495884*max_val+11.028804]]
df2 = pd.DataFrame(df2, columns = ['xi', 'y']) 
 

# Make the line graph 
trace_ln = go.Scatter(
                  x=df2['xi'],
                  y=df2['y'],
                  mode='lines',
                  name='Fit'
                  )
In [10]:
annotation = go.layout.Annotation(
                  x=500,
                  y=15000,
                  text='$R^2 = 0.025,\\y = 0.495884x + 11.028804$',
                  showarrow=False,
                  font=go.Font(size=16)
                  )
layout = go.Layout(annotations=[annotation])


# Tie everything together to a figure
fig = go.Figure([tracedot,trace_ln], layout=layout)


# Edit the figure layout 
fig.update_layout(title='Gene correlations between metagenomics and metatranscriptomic resutls',
                  xaxis_title='Count (Metagenomics)',
                  yaxis_title='Count (Metatranscriptomics)')

fig.show()
/Users/ZXue/miniconda3/lib/python3.7/site-packages/plotly/graph_objs/_deprecations.py:329: DeprecationWarning:

plotly.graph_objs.Font is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.Font
  - plotly.graph_objs.layout.hoverlabel.Font
  - etc.


In [11]:
# go trace plotting can not set color to dots by category 
## so I am now trying to set color with px plotting
## and add line and text through adding trace
figdot = px.scatter(df, x="DNA", y="RNA", color="Category1", hover_data=['KO_ID'])

# a "dummy" scatter plot trace that is used to carry the text 
trace_txt = go.Scatter(   
    x=[700],
    y=[15000],
    mode='markers+text',
    text=['$R^2 = 0.025,\\y = 0.495884x + 11.028804$'],
    marker=dict(size = 1),
    name='linear regression model',
    textposition='bottom center'
)


figdot.add_trace(trace_ln).add_trace(trace_txt)
#go.Figure(figdot, layout=layout) # why is the text in layout settings not showing up?!
figdot.show()
In [19]:
'''An example of interesting results: K01574 was expressed at very high level in metatrancriptomic results but its "copy number"
is relatively low by metagenomic measurements. '''

'''This KO ID is linked to both carbohydrate and lipid metabolism, suggesting robust microbial activities
within in the sampled Tara oceam specimen.'''

df[df['KO_ID']  == 'K01574']
Out[19]:
Gene DNA RNA KO_ID Category1 Category2 Category3 description
17157 TOBG-MED-731_1009 147.835516 17364.870275 K01574 Metabolism Carbohydrate metabolism 00640 Propanoate metabolism [PATH:ko00640] adc; acetoacetate decarboxylase [EC:4.1.1.4]
17158 TOBG-MED-731_1009 147.835516 17364.870275 K01574 Metabolism Lipid metabolism 00072 Synthesis and degradation of ketone bodi... adc; acetoacetate decarboxylase [EC:4.1.1.4]
17159 TOBG-MED-731_987 147.857693 1771.744287 K01574 Metabolism Carbohydrate metabolism 00640 Propanoate metabolism [PATH:ko00640] adc; acetoacetate decarboxylase [EC:4.1.1.4]
17160 TOBG-MED-731_987 147.857693 1771.744287 K01574 Metabolism Lipid metabolism 00072 Synthesis and degradation of ketone bodi... adc; acetoacetate decarboxylase [EC:4.1.1.4]
In [ ]:
## To make a web-based application using plotly?